pip install lime
Collecting lime
Downloading lime-0.2.0.1.tar.gz (275 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 275.7/275.7 kB 4.3 MB/s eta 0:00:00
Preparing metadata (setup.py) ... done
Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from lime) (3.7.1)
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from lime) (1.23.5)
Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from lime) (1.10.1)
Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from lime) (4.66.1)
Requirement already satisfied: scikit-learn>=0.18 in /usr/local/lib/python3.10/dist-packages (from lime) (1.2.2)
Requirement already satisfied: scikit-image>=0.12 in /usr/local/lib/python3.10/dist-packages (from lime) (0.19.3)
Requirement already satisfied: networkx>=2.2 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (3.1)
Requirement already satisfied: pillow!=7.1.0,!=7.1.1,!=8.3.0,>=6.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (9.4.0)
Requirement already satisfied: imageio>=2.4.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (2.31.1)
Requirement already satisfied: tifffile>=2019.7.26 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (2023.8.12)
Requirement already satisfied: PyWavelets>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (1.4.1)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (23.1)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.18->lime) (1.3.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.18->lime) (3.2.0)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (1.1.0)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (4.42.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (1.4.4)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (3.1.1)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (2.8.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->lime) (1.16.0)
Building wheels for collected packages: lime
Building wheel for lime (setup.py) ... done
Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283835 sha256=a0125bd8374e83ce8d08e854fe1fc921dd36d2600d4e162251655b427f37d124
Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
import xgboost as xgb
from lime import lime_tabular
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_predict
# Loading the CT slice localization dataset from CSV
file_path = '/content/slice_localization_data.csv'
df = pd.read_csv(file_path)
# Peek at the first rows to sanity-check the load
df.head()
| patientId | value0 | value1 | value2 | value3 | value4 | value5 | value6 | value7 | value8 | ... | value375 | value376 | value377 | value378 | value379 | value380 | value381 | value382 | value383 | reference | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.25 | -0.25 | -0.25 | ... | -0.25 | 0.980381 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.25 | -0.25 | 21.803851 |
| 1 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.25 | -0.25 | -0.25 | ... | -0.25 | 0.977008 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.25 | -0.25 | 21.745726 |
| 2 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.25 | -0.25 | -0.25 | ... | -0.25 | 0.977008 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.25 | -0.25 | 21.687600 |
| 3 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.25 | -0.25 | -0.25 | ... | -0.25 | 0.977008 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.25 | -0.25 | 21.629474 |
| 4 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.25 | -0.25 | -0.25 | ... | -0.25 | 0.976833 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.25 | -0.25 | 21.571348 |
5 rows × 386 columns
# Report the dataset dimensions as (rows, columns)
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)
Dataset shape: (53500, 386)
# Checking and printing of numerical and categorical columns
categorical_columns = df.select_dtypes(include='object').columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
# Count each group in one step
categorical_count, numerical_count = len(categorical_columns), len(numerical_columns)
print("Categorical columns:", categorical_columns)
print("Numerical columns:", numerical_columns)
print("Number of categorical columns:", categorical_count)
print("Number of numerical columns:", numerical_count)
Categorical columns: Index([], dtype='object')
Numerical columns: Index(['patientId', 'value0', 'value1', 'value2', 'value3', 'value4', 'value5',
'value6', 'value7', 'value8',
...
'value375', 'value376', 'value377', 'value378', 'value379', 'value380',
'value381', 'value382', 'value383', 'reference'],
dtype='object', length=386)
Number of categorical columns: 0
Number of numerical columns: 386
# Check for null values
null_counts = df.isnull().sum()
print("Columns with null values:")
# Filter down to the columns that actually contain nulls before printing
for column, count in null_counts[null_counts > 0].items():
    print(f"{column}: {count} null values")
Columns with null values:
From the above dataset, we can see that variables named as ‘value0’, ‘value1’,.. ‘value383’ contain feature values of CT scan images for each patient. The last variable is ‘reference’. This ‘reference’ is our target variable and it contains the relative location of the CT slice.
# Summary statistics for every column (all columns here are numeric)
df.describe(include='all')
| patientId | value0 | value1 | value2 | value3 | value4 | value5 | value6 | value7 | value8 | ... | value375 | value376 | value377 | value378 | value379 | value380 | value381 | value382 | value383 | reference | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | ... | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 | 53500.000000 |
| mean | 47.075701 | 0.059627 | 0.071558 | 0.145819 | 0.218728 | 0.274762 | 0.276189 | 0.204531 | 0.062281 | -0.042025 | ... | -0.029404 | 0.182913 | 0.320112 | 0.359373 | 0.342889 | 0.266091 | 0.083049 | -0.031146 | -0.154524 | 47.028039 |
| std | 27.414240 | 0.174243 | 0.196921 | 0.300270 | 0.359163 | 0.378862 | 0.369605 | 0.351294 | 0.292232 | 0.268391 | ... | 0.085817 | 0.383333 | 0.463517 | 0.478188 | 0.471811 | 0.437633 | 0.279734 | 0.098738 | 0.122491 | 22.347042 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.250000 | -0.250000 | -0.250000 | -0.250000 | ... | -0.250000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.250000 | -0.250000 | -0.250000 | 1.738733 |
| 25% | 23.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.250000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.250000 | 29.891607 |
| 50% | 46.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.250000 | 43.987893 |
| 75% | 70.000000 | 0.000000 | 0.000000 | 0.000000 | 0.446429 | 0.684477 | 0.662382 | 0.441412 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.996286 | 0.999677 | 0.999560 | 0.949478 | 0.000000 | 0.000000 | 0.000000 | 63.735059 |
| max | 96.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.998790 | 0.996468 | 0.999334 | 1.000000 | 1.000000 | ... | 0.961279 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.999857 | 0.996839 | 0.942851 | 97.489115 |
8 rows × 386 columns
# Getting unique values of the "patientId" column
np.unique(df["patientId"])
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96])
# List all 386 column names (patientId, value0..value383, reference)
print(df.columns)
Index(['patientId', 'value0', 'value1', 'value2', 'value3', 'value4', 'value5',
'value6', 'value7', 'value8',
...
'value375', 'value376', 'value377', 'value378', 'value379', 'value380',
'value381', 'value382', 'value383', 'reference'],
dtype='object', length=386)
# Drop the 'patientId' identifier, then separate the feature matrix from the target
df_copy = df.drop(columns=['patientId'])
df_y = df_copy['reference']
df_x = df_copy.drop(columns=['reference'])
# Plotting of target variable distribution
plt.figure(figsize=(12, 8))
# FIX: histplot replaces sns.distplot, which is deprecated and slated for
# removal in seaborn 0.14; kde=True + stat='density' reproduces the old look
sns.histplot(df_y, bins=100, kde=True, stat='density')
plt.title("Distribution of Target Variable (Reference)")
plt.xlabel("Reference Value")
plt.ylabel("Frequency")
plt.show()
<ipython-input-13-711025c69005>:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df_y, bins=100)
# Plot the distribution of means for each column in the dataset (excluding the first and last columns)
plt.figure(figsize=(8, 6))
# FIX: histplot replaces the deprecated sns.distplot (removed in seaborn 0.14)
sns.histplot(df_x.mean(), kde=True, stat='density')
plt.title("Distribution of Means for Each Attribute")
plt.xlabel("Mean Value")
plt.ylabel("Density")
plt.show()
# Explanation: This plot helps us understand the range and distribution of the means of different attributes in the dataset. It's a useful step to take before deciding whether to apply feature scaling techniques.
# ===> We can conclude all of our attributes are in [-1,1] range, so we don't need to use feature normalize technique
<ipython-input-14-aa548682aa42>:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df_x.mean())
# Convert the DataFrame to a NumPy array for plotting
X = df_x.values
# Create the figure and four polar axes arranged in a 2x2 layout
fig = plt.figure(figsize=(12, 10))
axes1 = fig.add_axes([0, 2, 1, 1], projection='polar')
axes2 = fig.add_axes([1, 2, 1, 1], projection='polar')
axes3 = fig.add_axes([0, 1, 1, 1], projection='polar')
axes4 = fig.add_axes([1, 1, 1, 1], projection='polar')
# Row 150, columns 1..240: bone-structure features
# NOTE(review): rows 150 and 3541 are presumably slices from two different
# patients — confirm against the patientId column of the original df.
axes1.plot(X[150, 1:241], 'bo', ms=10, label='Patient 1')
axes1.set_xlabel("Bone Structure")
axes1.legend()
# Row 150, columns 241..385: air-inclusion features
axes2.plot(X[150, 241:386], 'ro', ms=10, label='Patient 1')
axes2.set_xlabel("Air Inclusion")
axes2.legend()
# Row 3541: bone-structure features
axes3.plot(X[3541, 1:241], 'bo', ms=10, label='Patient 2')
axes3.set_xlabel("Bone Structure")
axes3.legend()
# Row 3541: air-inclusion features
axes4.plot(X[3541, 241:386], 'ro', ms=10, label='Patient 2')
axes4.set_xlabel("Air Inclusion")
axes4.legend()
plt.show()
Split the Data
# Hold out 30% of the samples as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.3, random_state=42)
# Preview the head of each split; the "\n" prefixes reproduce the blank
# separator lines between sections
for label, part in (("X_train", X_train), ("\nX_test", X_test),
                    ("\ny_train", y_train), ("\ny_test", y_test)):
    print(f"{label}:")
    print(part.head())
X_train:
value0 value1 value2 value3 value4 value5 value6 \
19113 0.000000 0.000000 0.000000 0.000000 0.939018 0.965932 0.873580
40279 0.114286 0.020930 0.000000 0.963090 0.680756 0.558228 0.439762
42189 0.000000 0.000000 0.000000 0.751979 0.000000 0.842081 0.904479
30994 0.000000 0.000000 0.914286 0.855310 0.893836 0.000000 0.000000
19373 0.000000 0.267206 0.904605 0.972478 0.856772 0.000000 0.000000
value7 value8 value9 ... value374 value375 value376 \
19113 0.000000 0.000000 -0.25 ... 0.0 0.0 0.000000
40279 0.898336 0.741252 0.00 ... 0.0 0.0 0.000000
42189 0.943744 0.699692 0.00 ... 0.0 0.0 0.994347
30994 0.000000 0.000000 -0.25 ... 0.0 0.0 0.000000
19373 0.000000 0.000000 -0.25 ... 0.0 0.0 0.000000
value377 value378 value379 value380 value381 value382 value383
19113 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
40279 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
42189 0.999256 0.999908 0.997149 0.0 0.0 0.0 0.0
30994 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
19373 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0
[5 rows x 384 columns]
X_test:
value0 value1 value2 value3 value4 value5 value6 \
46266 0.00000 0.000000 0.000000 0.000000 0.000000 0.704285 0.0
2778 0.86783 0.898305 0.169761 0.411677 0.950726 0.000000 0.0
34408 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0
13871 0.00000 0.000000 0.000000 0.000000 0.797264 0.390216 0.0
35994 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0
value7 value8 value9 ... value374 value375 value376 value377 \
46266 0.00 -0.25 -0.25 ... 0.0 0.0 0.0 0.797007
2778 -0.25 -0.25 -0.25 ... 0.0 0.0 0.0 0.000000
34408 0.00 -0.25 -0.25 ... 0.0 0.0 0.0 0.999131
13871 0.00 0.00 -0.25 ... 0.0 0.0 0.0 0.000000
35994 0.00 -0.25 -0.25 ... 0.0 0.0 0.0 0.000000
value378 value379 value380 value381 value382 value383
46266 0.999385 0.999884 0.999754 0.0 0.00 -0.25
2778 0.988959 0.985753 0.000000 0.0 -0.25 -0.25
34408 0.999918 0.999959 0.999382 0.0 0.00 -0.25
13871 0.000000 0.000000 0.000000 0.0 0.00 -0.25
35994 0.000000 0.000000 0.000000 0.0 0.00 -0.25
[5 rows x 384 columns]
y_train:
19113 91.005317
40279 26.270760
42189 33.807090
30994 79.645955
19373 81.673863
Name: reference, dtype: float64
y_test:
46266 43.349961
2778 12.119149
34408 39.070125
13871 88.928088
35994 60.022506
Name: reference, dtype: float64
# Compare the target distribution before the split with the train/test splits
plt.figure(figsize=(12, 6))
# Left panel: full dataset
plt.subplot(1, 2, 1)
plt.hist(df_y, bins=30, color='blue', alpha=0.7)
plt.title("Distribution of Target Variable (Reference) - Before Split")
plt.xlabel("Reference Value")
plt.ylabel("Frequency")
# Right panel: train and test distributions overlaid
plt.subplot(1, 2, 2)
for subset, color, label in ((y_train, 'green', 'Train'), (y_test, 'orange', 'Test')):
    plt.hist(subset, bins=30, color=color, alpha=0.7, label=label)
plt.title("Distribution of Target Variable (Reference) - After Split")
plt.xlabel("Reference Value")
plt.ylabel("Frequency")
plt.legend()
plt.tight_layout()
plt.show()
# Initializing a Linear Regression model and fitting it on the raw features
lm = LinearRegression()
lm.fit(X_train, y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
# Evaluation on test set
y_pred_linear_test = lm.predict(X_test)
mse = mean_squared_error(y_test, y_pred_linear_test)
mae = mean_absolute_error(y_test, y_pred_linear_test)
r2 = r2_score(y_test, y_pred_linear_test)
# Print all three metrics with a uniform format
for label, value in (('Test Set Mean Squared Error', mse),
                     ('Test Set Mean Absolute Error', mae),
                     ('Test Set R-squared', r2)):
    print(f'{label}:', value)
Test Set Mean Squared Error: 68.29434562989341 Test Set Mean Absolute Error: 6.123343272084307 Test Set R-squared: 0.8624473194580933
# Evaluation on training set
y_pred_linear_train = lm.predict(X_train)
mse = mean_squared_error(y_train, y_pred_linear_train)
mae = mean_absolute_error(y_train, y_pred_linear_train)
r2 = r2_score(y_train, y_pred_linear_train)
# Print all three metrics with a uniform format
for label, value in (('Train Set Mean Squared Error', mse),
                     ('Train Set Mean Absolute Error', mae),
                     ('Train Set R-squared', r2)):
    print(f'{label}:', value)
Train Set Mean Squared Error: 67.82275427597651 Train Set Mean Absolute Error: 6.10251339719793 Train Set R-squared: 0.8645188670291543
# Define the function to plot learning curves
def plotLearningCurves(X, y, step):
    """Plot train/test MSE as a function of the training-set size.

    BUG FIX: the original fit ``lm`` once on all of (X, y) before the loop,
    then measured subset errors with that single fully-trained model — and it
    sliced the globals X_train/y_train instead of the X/y parameters. Both
    curves were therefore flat and meaningless. The model is now refit on the
    first ``i`` rows at every step.

    NOTE(review): test error still uses the module-level X_test / y_test,
    matching the original's intent.
    """
    m, n = X.shape
    max_size = (m // 10) * 10          # round m down to a multiple of 10
    N_size_arr = np.arange(10, max_size + 10, step)
    error_arr = np.zeros((len(N_size_arr), 2))

    # Increasing train dataset size by "step" rows in each iteration
    for index, i in enumerate(N_size_arr):
        X_subset = X[:i]
        y_subset = y[:i]
        lm.fit(X_subset, y_subset)     # refit on the growing subset
        error_arr[index, 0] = mean_squared_error(y_subset, lm.predict(X_subset))
        error_arr[index, 1] = mean_squared_error(y_test, lm.predict(X_test))

    # Plot "training set size" vs. MSE for the train and test errors
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_axes([0, 0, 1, 1])
    ax.set_yscale('log')
    line1, = ax.plot(N_size_arr, error_arr[:, 0], c='red')
    line2, = ax.plot(N_size_arr, error_arr[:, 1], c='blue')
    ax.set_xlabel("N (Training set size)")
    ax.set_ylabel("Mean Squared Error")
    ax.legend((line1, line2), ("Train Error", "Test Error"))

# Call the function to plot the learning curves
plotLearningCurves(X_train, y_train, 200)
# Predicting reference values with the test dataset
y_pred = lm.predict(X_test)
# Plotting predictions vs. y_test
fig = plt.figure(figsize=(8, 5))
ax = fig.add_axes([0, 0, 1, 1])
# FIX: axis labels were swapped — the x data is y_test (actual targets)
# and the y data is y_pred (model predictions)
ax.set_xlabel("Test Target Variable")
ax.set_ylabel("Predictions")
ax.plot(y_test, y_pred, 'bo', ms=1)
# Display the plot
plt.show()
Based on the above information, it appears that the model might be slightly overfitting. Overfitting occurs when a model learns the training data too well and performs poorly on unseen data.
The mean squared error (MSE) and mean absolute error (MAE) on the training dataset are slightly lower than on the test dataset. Additionally, the R-squared value on the training dataset is higher than on the test dataset.
High complexity model: Overfitting can occur when the model is too complex relative to the available data. With a large number of features (384 after dropping 'patientId' and 'reference') across 53,500 instances, it's possible that the model has learned noise or irrelevant patterns in the training data.
Feature Standardization
# Standardize features to zero mean / unit variance before PCA
scaler = StandardScaler()
scaled_df_x = scaler.fit_transform(df_x)
# PCA keeping as many components as needed to explain 75% of the variance
pca = PCA(0.75)
pca_vectors = pca.fit_transform(scaled_df_x)
# Report each retained component's share of the variance (1-indexed)
for index, var in enumerate(pca.explained_variance_ratio_, start=1):
    print("Explained Variance ratio by Principal Component ", index, " : ", var)
Explained Variance ratio by Principal Component 1 : 0.14855715257210392 Explained Variance ratio by Principal Component 2 : 0.12108312979894122 Explained Variance ratio by Principal Component 3 : 0.06474442177698231 Explained Variance ratio by Principal Component 4 : 0.03774984839961066 Explained Variance ratio by Principal Component 5 : 0.03510333274248252 Explained Variance ratio by Principal Component 6 : 0.025645225528655405 Explained Variance ratio by Principal Component 7 : 0.02330954175071417 Explained Variance ratio by Principal Component 8 : 0.021428275145390387 Explained Variance ratio by Principal Component 9 : 0.017614067903705406 Explained Variance ratio by Principal Component 10 : 0.01589391279292687 Explained Variance ratio by Principal Component 11 : 0.014032726489128131 Explained Variance ratio by Principal Component 12 : 0.012871921022909665 Explained Variance ratio by Principal Component 13 : 0.0121634919283602 Explained Variance ratio by Principal Component 14 : 0.01072092397679328 Explained Variance ratio by Principal Component 15 : 0.009760345345618412 Explained Variance ratio by Principal Component 16 : 0.009482151161428901 Explained Variance ratio by Principal Component 17 : 0.008695231697355841 Explained Variance ratio by Principal Component 18 : 0.008205878792193493 Explained Variance ratio by Principal Component 19 : 0.008041977514589788 Explained Variance ratio by Principal Component 20 : 0.007543652519555233 Explained Variance ratio by Principal Component 21 : 0.006931981984392693 Explained Variance ratio by Principal Component 22 : 0.00646707766389355 Explained Variance ratio by Principal Component 23 : 0.006083596311372858 Explained Variance ratio by Principal Component 24 : 0.005868832669382626 Explained Variance ratio by Principal Component 25 : 0.005741237892731493 Explained Variance ratio by Principal Component 26 : 0.005545363955500639 Explained Variance ratio by Principal Component 27 : 0.0053812466975905315 Explained Variance 
ratio by Principal Component 28 : 0.005160938036779552 Explained Variance ratio by Principal Component 29 : 0.005084092580391446 Explained Variance ratio by Principal Component 30 : 0.004799621223915218 Explained Variance ratio by Principal Component 31 : 0.004652832065005538 Explained Variance ratio by Principal Component 32 : 0.004574687218755086 Explained Variance ratio by Principal Component 33 : 0.004478718115698722 Explained Variance ratio by Principal Component 34 : 0.004312607244892595 Explained Variance ratio by Principal Component 35 : 0.004197561850896129 Explained Variance ratio by Principal Component 36 : 0.004096225788266026 Explained Variance ratio by Principal Component 37 : 0.004050004696179025 Explained Variance ratio by Principal Component 38 : 0.0040203530745072285 Explained Variance ratio by Principal Component 39 : 0.0039032597942033425 Explained Variance ratio by Principal Component 40 : 0.0037259511080387207 Explained Variance ratio by Principal Component 41 : 0.0036811022474497667 Explained Variance ratio by Principal Component 42 : 0.003532807590921863 Explained Variance ratio by Principal Component 43 : 0.0035221449980117033 Explained Variance ratio by Principal Component 44 : 0.0033683692074793284 Explained Variance ratio by Principal Component 45 : 0.003324213440610176 Explained Variance ratio by Principal Component 46 : 0.0032417350414558144 Explained Variance ratio by Principal Component 47 : 0.003097464371346726 Explained Variance ratio by Principal Component 48 : 0.0030493180022711022 Explained Variance ratio by Principal Component 49 : 0.0030174010370553972 Explained Variance ratio by Principal Component 50 : 0.002893053408362604 Explained Variance ratio by Principal Component 51 : 0.002831606959181239 Explained Variance ratio by Principal Component 52 : 0.002775687439238411
With PCA we are able to reduce the dimensionality from 384 features to 52 principal components while retaining 75% of the total variance.
# Visualize per-component and cumulative explained variance
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)
component_ids = range(1, len(explained_variance_ratio) + 1)
plt.figure(figsize=(10, 6))
plt.plot(component_ids, explained_variance_ratio, marker='o', label='Explained Variance Ratio')
plt.plot(component_ids, cumulative_variance, marker='o', label='Cumulative Explained Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance Ratio / Cumulative Explained Variance')
plt.title('Explained Variance Ratio and Cumulative Explained Variance')
plt.legend()
plt.show()
# pca_vectors was already produced by the fit_transform above; re-running
# fit_transform on the same data just redid identical work, so the cached
# result is reused here.
print("PCA Vectors:")
print(pca_vectors)
PCA Vectors: [[ 1.66418097e+01 -5.28637854e+00 5.90813596e+00 ... 1.47466546e+00 -6.07350165e-03 7.87119195e-01] [ 1.65943085e+01 -4.94762736e+00 6.12864705e+00 ... 1.18524823e+00 -1.03928883e-01 1.02606483e+00] [ 1.65927230e+01 -4.91301981e+00 6.17570643e+00 ... 1.15494987e+00 -1.00923531e-01 8.95570094e-01] ... [-4.79214559e+00 1.42699675e+01 5.57219567e-01 ... -7.52523437e-01 1.66459189e-01 7.02028650e-02] [ 1.73465338e+01 -3.01729764e+00 6.15848484e+00 ... -5.72477971e-01 -1.88995484e-01 1.73028394e+00] [ 1.75212840e+01 -2.49155501e+00 5.31833961e+00 ... -4.57566760e-01 -3.26204443e-01 1.49666375e+00]]
# Look at pairwise relationships among the first five principal components
subset_pca = pca_vectors[:, :5]  # adjust the number of components as desired
pc_labels = [f'Principal Component {i}' for i in range(1, subset_pca.shape[1] + 1)]
df_subset_pca = pd.DataFrame(subset_pca, columns=pc_labels)
# Scatter plot matrix of the selected components
sns.set(style='ticks')
sns.pairplot(df_subset_pca)
plt.suptitle('Scatter Plot Matrix of Subset Principal Components')
plt.show()
sns.set(style="ticks")
sns.set_palette(palette='Set1')
fig_1 = plt.figure(figsize=(18,6))
sns.set(style="ticks")
sns.set_palette(palette='Set1')
# Binned regression plots of the top three components against the target
for comp_idx in range(3):
    sns.regplot(x=pca_vectors[:, comp_idx], y=df_y,
                label=f'Principal Component {comp_idx + 1}', x_bins=10)
plt.title('Most Important Principal Components vs Reference Value')
plt.xlabel('Principal Component Value')
plt.ylabel('Reference Value')
plt.legend()
plt.show()
# NOTE(review): X_train_pca is computed here but never used — the scatter
# below plots the full-data pca_vectors instead. Confirm which projection
# was intended; this refit also rebinds `pca` to a 3-component model.
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train)
# Create a 3D plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# Plot the target variable (reference) as color gradient
sc = ax.scatter(pca_vectors[:, 0], pca_vectors[:, 1], df_y, c=df_y, cmap='viridis')
# Add labels and title
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Reference')
ax.set_title('3D Plot: Reference vs. Principal Components 1 and 2')
# Add color bar
cbar = fig.colorbar(sc)
cbar.set_label('Reference')
# Show the plot
plt.show()
# Splitting the PCA-reduced data into training and testing sets
train_x, test_x, train_y, test_y = train_test_split(pca_vectors, df_y, test_size=0.3, random_state=42)
# Initializing a Linear Regression model on the principal components
lm_pca = LinearRegression()
lm_pca.fit(train_x, train_y)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
# Evaluation on testing set (PCA features)
y_pred_linear_test_pca = lm_pca.predict(test_x)
mse = mean_squared_error(test_y, y_pred_linear_test_pca)
mae = mean_absolute_error(test_y, y_pred_linear_test_pca)
# Kept under its own name, presumably for a later model comparison
linear_r2 = r2_score(test_y, y_pred_linear_test_pca)
print('Test Set Mean Squared Error:', mse)
print('Test Set Mean Absolute Error:', mae)
print('Test Set R-squared:', linear_r2)
Test Set Mean Squared Error: 87.75937220286849 Test Set Mean Absolute Error: 6.881495867430069 Test Set R-squared: 0.8232425133026595
# Evaluation on training set (PCA features)
y_pred_linear_train_pca = lm_pca.predict(train_x)
mse = mean_squared_error(train_y, y_pred_linear_train_pca)
mae = mean_absolute_error(train_y, y_pred_linear_train_pca)
# FIX: R-squared was computed against y_train (the raw-feature split's
# target) instead of train_y (the PCA split this model was trained on)
r2 = r2_score(train_y, y_pred_linear_train_pca)
print('Train Set Mean Squared Error:', mse)
print('Train Set Mean Absolute Error:', mae)
print('Train Set R-squared:', r2)
Train Set Mean Squared Error: 89.76536510098475 Train Set Mean Absolute Error: 6.93228434511436 Train Set R-squared: 0.820686825605212
# Predict the target variable on the test set
y_test_pred = lm_pca.predict(test_x)
# Plot the predictions against the actual reference values; the dashed
# red diagonal marks perfect prediction (y = x)
plt.figure(figsize=(10, 6))
plt.scatter(test_y, y_test_pred, color='blue', alpha=0.5)
plt.plot([min(test_y), max(test_y)], [min(test_y), max(test_y)], color='red', linestyle='--')
plt.xlabel('Actual Reference Values')
plt.ylabel('Predicted Reference Values')
plt.title('Actual vs. Predicted Reference Values (Test Set)')
plt.show()
# Create a Ridge regression model (L2 regularization, alpha=0.3) on PCA features
ridge = Ridge(alpha=0.3)
ridge.fit(train_x, train_y)
Ridge(alpha=0.3)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Ridge(alpha=0.3)
# Evaluate the Ridge model on the testing set
y_pred_ridge_test = ridge.predict(test_x)
mse = mean_squared_error(test_y, y_pred_ridge_test)
mae = mean_absolute_error(test_y, y_pred_ridge_test)
# Kept under its own name, presumably for a later model comparison
ridge_r2 = r2_score(test_y, y_pred_ridge_test)
print('Test Set Mean Squared Error:', mse)
print('Test Set Mean Absolute Error:', mae)
print('Test Set R-squared:', ridge_r2)
Test Set Mean Squared Error: 87.75936425110947 Test Set Mean Absolute Error: 6.881494004018261 Test Set R-squared: 0.8232425293184183
# Evaluate Ridge on the data it was fitted on
y_pred_ridge_train = ridge.predict(train_x)
train_mse = mean_squared_error(train_y, y_pred_ridge_train)
train_mae = mean_absolute_error(train_y, y_pred_ridge_train)
train_r2 = r2_score(train_y, y_pred_ridge_train)
# Print all three metrics with a uniform format
for label, value in (('Train Set Mean Squared Error', train_mse),
                     ('Train Set Mean Absolute Error', train_mae),
                     ('Train Set R-squared', train_r2)):
    print(f'{label}:', value)
Train Set Mean Squared Error: 89.76536510173793 Train Set Mean Absolute Error: 6.932282934455073 Train Set R-squared: 0.8206868256037074
# Predict the target variable on the test set
y_test_pred = ridge.predict(test_x)
# Plot the predictions against the actual reference values
plt.figure(figsize=(10, 6))
# FIX: compare against test_y (the PCA split this model uses) rather than
# y_test from the earlier raw-feature split
plt.scatter(test_y, y_test_pred, color='blue', alpha=0.5)
plt.plot([min(test_y), max(test_y)], [min(test_y), max(test_y)], color='red', linestyle='--')
plt.xlabel('Actual Reference Values')
plt.ylabel('Predicted Reference Values')
plt.title('Actual vs. Predicted Reference Values (Test Set)')
plt.show()
# Fit an Elastic Net regression model, selecting alpha by 5-fold cross-validation
regr_en = ElasticNetCV(cv=5, alphas=[0.1, 0.3, 0.5, 0.7, 1.0])
regr_en.fit(train_x, train_y)
ElasticNetCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0], cv=5)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
ElasticNetCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0], cv=5)
# Evaluate the Elastic Net model on the testing set
y_pred_elastic_test = regr_en.predict(test_x)
mse = mean_squared_error(test_y, y_pred_elastic_test)
mae = mean_absolute_error(test_y, y_pred_elastic_test)
# Kept under its own name, presumably for a later model comparison
elastic_r2 = r2_score(test_y, y_pred_elastic_test)
print('Test Set Mean Squared Error:', mse)
print('Test Set Mean Absolute Error:', mae)
print('Test Set R-squared:',elastic_r2)
# Iterate and fine-tune the model as needed by adjusting PCA components, alpha, l1_ratio, or other parameters
# Evaluate the model's performance and repeat the process until satisfactory results are achieved.
Test Set Mean Squared Error: 87.75598081697953 Test Set Mean Absolute Error: 6.857716549194305 Test Set R-squared: 0.8232493439445739
# Evaluate Elastic Net on the training set to gauge over/underfitting
y_pred_elastic_train = regr_en.predict(train_x)
# Calculate evaluation metrics on the training set
train_mse = mean_squared_error(train_y, y_pred_elastic_train)
train_mae = mean_absolute_error(train_y, y_pred_elastic_train)
train_r2 = r2_score(train_y, y_pred_elastic_train)
print('Train Set Mean Squared Error:', train_mse)
print('Train Set Mean Absolute Error:', train_mae)
print('Train Set R-squared:', train_r2)
Train Set Mean Squared Error: 89.9141725526254 Train Set Mean Absolute Error: 6.914168265980384 Train Set R-squared: 0.8203895713524468
# Predict the target variable on the test set
y_test_pred = regr_en.predict(test_x)
# Plot the predictions against the actual reference values.
# NOTE(fix): the scatter used test_y while the identity line used y_test
# (a target from a different split) — both now use test_y consistently.
plt.figure(figsize=(10, 6))
plt.scatter(test_y, y_test_pred, color='blue', alpha=0.5)
plt.plot([min(test_y), max(test_y)], [min(test_y), max(test_y)], color='red', linestyle='--')
plt.xlabel('Actual Reference Values')
plt.ylabel('Predicted Reference Values')
plt.title('Actual vs. Predicted Reference Values (Test Set)')
plt.show()
# Split the data into training and testing sets (using the PCA vectors)
X_train, X_test, y_train, y_test = train_test_split(pca_vectors, df_y, test_size=0.3, random_state=42)
# Build and fit a Decision Tree Regression model on the training split
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train, y_train)
# Score the fitted tree on the held-out testing split
y_pred_decision_test = dt_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred_decision_test)
mae = mean_absolute_error(y_test, y_pred_decision_test)
decision_r2 = r2_score(y_test, y_pred_decision_test)
for label, score in (('Test Set Mean Squared Error', mse),
                     ('Test Set Mean Absolute Error', mae),
                     ('Test Set R-squared', decision_r2)):
    print(f'{label}:', score)
Test Set Mean Squared Error: 12.940707700156338 Test Set Mean Absolute Error: 0.7184258982866043 Test Set R-squared: 0.97393592374525
# Measure in-sample (training-set) performance of the decision tree.
y_pred_decision_train = dt_regressor.predict(X_train)
train_mse = mean_squared_error(y_train, y_pred_decision_train)
train_mae = mean_absolute_error(y_train, y_pred_decision_train)
train_r2 = r2_score(y_train, y_pred_decision_train)
for label, score in (('Train Set Mean Squared Error', train_mse),
                     ('Train Set Mean Absolute Error', train_mae),
                     ('Train Set R-squared', train_r2)):
    print(f'{label}:', score)
Train Set Mean Squared Error: 1.979743065754344e-06 Train Set Mean Absolute Error: 1.0282376502002683e-05 Train Set R-squared: 0.999999996045312
# Visualise decision-tree predictions on the test split against the actuals.
y_test_pred = dt_regressor.predict(X_test)
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_test_pred, color='blue', alpha=0.5)
lo, hi = min(y_test), max(y_test)
plt.plot([lo, hi], [lo, hi], color='red', linestyle='--')  # identity (perfect-fit) line
plt.xlabel('Actual Reference Values')
plt.ylabel('Predicted Reference Values')
plt.title('DT Regressor: Actual vs. Predicted Reference Values')
plt.show()
Stacking with the PCA-vector models
The PCA-transformed input data is fed to the linear, ridge, elastic net, and decision tree base models.
# Stack the four base-model training predictions column-wise as meta-features
base_train_preds = (y_pred_linear_train_pca, y_pred_ridge_train,
                    y_pred_elastic_train, y_pred_decision_train)
meta_train_features = np.column_stack(base_train_preds)
# Hyperparameter search space for the XGBoost meta-model
param_dist = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 1, 5],
    'min_child_weight': [1, 5, 10],
    'reg_alpha': [0, 1e-5, 1e-2, 0.1, 1],
    'reg_lambda': [0, 1e-5, 1e-2, 0.1, 1],
}
# Initialize the XGBoost regressor
xgb_model = xgb.XGBRegressor()
# Sample 10 random configurations, each scored by 5-fold CV negative MSE
random_search = RandomizedSearchCV(
    xgb_model, param_distributions=param_dist,
    n_iter=10, scoring='neg_mean_squared_error',
    cv=5, verbose=1, random_state=42,
)
random_search.fit(meta_train_features, train_y)
# Report the winning configuration and its CV score
best_params = random_search.best_params_
best_score = random_search.best_score_
print("Best Hyperparameters:", best_params)
print("\nBest Score (Negative MSE):", best_score)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'subsample': 0.9, 'reg_lambda': 0, 'reg_alpha': 1, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.2, 'gamma': 0, 'colsample_bytree': 1.0}
Best Score (Negative MSE): -0.003914909703952625
# Print each tuned hyperparameter on its own line
print("\n".join(f"{key}: {value}" for key, value in best_params.items()))
subsample: 0.9 reg_lambda: 0 reg_alpha: 1 n_estimators: 300 min_child_weight: 1 max_depth: 4 learning_rate: 0.2 gamma: 0 colsample_bytree: 1.0
# Meta-Model Selection and Training
# Hyperparameters are copied verbatim from the RandomizedSearchCV best_params_
# printed above.
best_subsample = 0.9
best_reg_lambda = 0
best_reg_alpha = 1
best_n_estimators = 300
best_min_child_weight = 1
best_max_depth = 4
best_learning_rate = 0.2
best_gamma = 0
best_colsample_bytree = 1.0
_tuned_params = {
    'subsample': best_subsample,
    'reg_lambda': best_reg_lambda,
    'reg_alpha': best_reg_alpha,
    'n_estimators': best_n_estimators,
    'min_child_weight': best_min_child_weight,
    'max_depth': best_max_depth,
    'learning_rate': best_learning_rate,
    'gamma': best_gamma,
    'colsample_bytree': best_colsample_bytree,
}
meta_model = xgb.XGBRegressor(random_state=42, **_tuned_params)
meta_model.fit(meta_train_features, train_y)
XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=1.0, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=0, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.2, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=4, max_leaves=None,
min_child_weight=1, missing=nan, monotone_constraints=None,
n_estimators=300, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=1.0, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=0, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.2, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=4, max_leaves=None,
min_child_weight=1, missing=nan, monotone_constraints=None,
n_estimators=300, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...)# Evaluation on testing set
# Evaluation on testing set: stack the base-model test predictions and
# score the meta-model's ensemble output.
base_test_preds = (y_pred_linear_test_pca, y_pred_ridge_test,
                   y_pred_elastic_test, y_pred_decision_test)
meta_test_features = np.column_stack(base_test_preds)
ensemble_test_predictions = meta_model.predict(meta_test_features)
ensemble_mse = mean_squared_error(test_y, ensemble_test_predictions)
print("Test Set Ensemble MSE:", ensemble_mse)
ensemble_r2 = r2_score(test_y, ensemble_test_predictions)
print("Test Set Ensemble R-squared:", ensemble_r2)
Test Set Ensemble MSE: 12.943920760380433 Test Set Ensemble R-squared: 0.9739294522717701
# Evaluation on training set
# NOTE(fix): the meta-model was trained on features whose first column was
# y_pred_linear_train_pca; the original used y_pred_linear_train here, which
# mismatched the training feature layout and skewed the train-set metrics.
meta_train_features = np.column_stack((y_pred_linear_train_pca, y_pred_ridge_train,
                                       y_pred_elastic_train, y_pred_decision_train))
ensemble_train_predictions = meta_model.predict(meta_train_features)
ensemble_mse = mean_squared_error(train_y, ensemble_train_predictions)
print("Train Set Ensemble MSE:", ensemble_mse)
ensemble_r2 = r2_score(train_y, ensemble_train_predictions)
print("Train Set Ensemble R-squared:", ensemble_r2)
Train Set Ensemble MSE: 0.0033983004727524296 Train Set Ensemble R-squared: 0.9999932116351933
# Diagnostic plots for the ensemble: predicted-vs-actual scatter and residuals.
def _ensemble_scatter(xs, ys, ylabel, title, zero_line=False):
    # Shared scatter helper; optional dashed zero line for residual plots.
    plt.scatter(xs, ys)
    if zero_line:
        plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel("Actual Values")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()

_ensemble_scatter(test_y, ensemble_test_predictions,
                  "Predicted Values", "Actual vs. Predicted Values")
residuals = test_y - ensemble_test_predictions
_ensemble_scatter(test_y, residuals, "Residuals", "Residual Plot", zero_line=True)
# Bar chart comparing R-squared across the five models.
# NOTE(review): ensemble_r2 was last reassigned on the *training* set in a
# preceding cell, while the other four scores are test-set values — confirm
# this mix is intentional before drawing conclusions from the chart.
r2_values = [ensemble_r2, linear_r2, ridge_r2, elastic_r2, decision_r2]
model_names = ['Ensemble', 'Linear', 'Ridge', 'Elastic Net', 'Decision Tree']
bar_colors = ['blue', 'green', 'orange', 'red', 'purple']
plt.figure(figsize=(10, 6))
plt.bar(model_names, r2_values, color=bar_colors)
plt.title('Comparison of R-squared Values for Different Models')
plt.xlabel('Model')
plt.ylabel('R-squared Value')
plt.ylim(0, 1)  # R-squared is compared on a fixed 0–1 axis
plt.show()
LIME (Local Interpretable Model-agnostic Explanations)
# Build a LIME explainer over the stacked meta-features and explain one instance.
# NOTE(review): meta_test_features (not the training meta-features) is passed
# as LIME's background/training data — confirm this is intentional.
explainer = lime_tabular.LimeTabularExplainer(
    training_data=meta_test_features,
    feature_names=["linearModel", "ridgeModel", "elasticModel", "decisionTree"],
    mode="regression",  # regression problem, not classification
)
# Explain the instance at index 20 in terms of the four base-model outputs
explain_lime = explainer.explain_instance(
    meta_test_features[20],
    meta_model.predict,
    num_features=4,
)
# Render the explanation inline in the notebook
explain_lime.show_in_notebook()
k-fold Cross Validation
# 5-fold cross-validated predictions for the meta-model on the training
# meta-features, then the corresponding CV metrics.
ensemble_train_predictions = cross_val_predict(meta_model, meta_train_features, train_y, cv=5)
ensemble_mse_train_cv = mean_squared_error(train_y, ensemble_train_predictions)
ensemble_r2_train_cv = r2_score(train_y, ensemble_train_predictions)
for label, score in (
        ("Cross-Validation Train Set Ensemble MSE", ensemble_mse_train_cv),
        ("Cross-Validation Train Set Ensemble R-squared", ensemble_r2_train_cv)):
    print(f"{label}:", score)
Cross-Validation Train Set Ensemble MSE: 0.00402036325103711 Cross-Validation Train Set Ensemble R-squared: 0.9999919690172713
# Data Preparation: split the dataset 70/15/15 into training, validation,
# and test sets (the 30% remainder is halved between validation and test).
X_train, X_temp, y_train, y_temp = train_test_split(df_x, df_y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
# Instantiate the four base models and fit each on the training split
linear_reg = LinearRegression()
ridge_reg = Ridge(alpha=1.0)
elastic_net = ElasticNetCV(cv=5)
decision_tree = DecisionTreeRegressor(max_depth=5)
for base_model in (linear_reg, ridge_reg, elastic_net, decision_tree):
    base_model.fit(X_train, y_train)
DecisionTreeRegressor(max_depth=5)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeRegressor(max_depth=5)
# Collect base-model predictions on the validation split; their column-wise
# stack becomes the meta-features for the XGBoost meta-model.
val_predictions_linear = linear_reg.predict(X_val)
val_predictions_ridge = ridge_reg.predict(X_val)
val_predictions_elastic = elastic_net.predict(X_val)
val_predictions_tree = decision_tree.predict(X_val)
meta_features = np.column_stack(
    (val_predictions_linear, val_predictions_ridge,
     val_predictions_elastic, val_predictions_tree))
# Hyperparameter search space for the validation-set XGBoost meta-model
param_dist = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}
# Initialize the XGBoost regressor
xgb_model = xgb.XGBRegressor()
# Sample 10 random configurations, each scored by 5-fold CV negative MSE
random_search = RandomizedSearchCV(
    xgb_model, param_distributions=param_dist,
    n_iter=10, scoring='neg_mean_squared_error',
    cv=5, verbose=1, random_state=42,
)
random_search.fit(meta_features, y_val)
# Report the winning configuration and its CV score
best_params = random_search.best_params_
best_score = random_search.best_score_
print("Best Hyperparameters:", best_params)
print("Best Score (Negative MSE):", best_score)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'subsample': 1.0, 'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
Best Score (Negative MSE): -36.0905718275611
# Meta-Model Selection and Training
# Hyperparameters are copied verbatim from the RandomizedSearchCV best_params_
# printed above.
best_subsample = 1.0
best_n_estimators = 200
best_max_depth = 4
best_learning_rate = 0.1
best_colsample_bytree = 1.0
_tuned_params = {
    'subsample': best_subsample,
    'n_estimators': best_n_estimators,
    'max_depth': best_max_depth,
    'learning_rate': best_learning_rate,
    'colsample_bytree': best_colsample_bytree,
}
meta_model = xgb.XGBRegressor(random_state=42, **_tuned_params)
meta_model.fit(meta_features, y_val)
XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=1.0, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=4, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=200, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=1.0, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=4, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=200, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...)# Predict with the Ensemble on the Test Set
# Predict with the ensemble on the test set: stack the base-model outputs
# into meta-features and feed them to the meta-model.
test_predictions_linear = linear_reg.predict(X_test)
test_predictions_ridge = ridge_reg.predict(X_test)
test_predictions_elastic = elastic_net.predict(X_test)
test_predictions_tree = decision_tree.predict(X_test)
meta_test_features = np.column_stack(
    (test_predictions_linear, test_predictions_ridge,
     test_predictions_elastic, test_predictions_tree))
ensemble_predictions = meta_model.predict(meta_test_features)
# Evaluation on testing set
ensemble_mse = mean_squared_error(y_test, ensemble_predictions)
print("Test Set Ensemble MSE:", ensemble_mse)
ensemble_r2 = r2_score(y_test, ensemble_predictions)
print("Test Set Ensemble R-squared:", ensemble_r2)
Test Set Ensemble MSE: 38.6631255329587 Test Set Ensemble R-squared: 0.9219275233928775
# Diagnostic plots: predicted-vs-actual scatter and residual plot.
def _diag_scatter(xs, ys, ylabel, title, zero_line=False):
    # Shared scatter helper; optional dashed zero line for residual plots.
    plt.scatter(xs, ys)
    if zero_line:
        plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel("Actual Values")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()

_diag_scatter(y_test, ensemble_predictions,
              "Predicted Values", "Actual vs. Predicted Values")
residuals = y_test - ensemble_predictions
_diag_scatter(y_test, residuals, "Residuals", "Residual Plot", zero_line=True)
LIME (Local Interpretable Model-agnostic Explanations)
# LIME explanation of a single validation-set instance (index 10) in terms
# of the four base-model outputs.
explainer = lime_tabular.LimeTabularExplainer(
    training_data=meta_features,
    feature_names=["linearModel", "ridgeModel", "elasticModel", "decisionTree"],
    mode="regression",
)
explain_lime = explainer.explain_instance(meta_features[10], meta_model.predict, num_features=4)
explain_lime.show_in_notebook()